Prepared for: Dr. Dennis Doe
Analysis by: Yong Won Jin
Date modified: 2022-06-19
import os
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
1.1 Load data
warnings.simplefilter(action='ignore', category=FutureWarning)
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=100, facecolor='white')
pbmc33k = sc.read_10x_mtx('./pbmc33k_filtered_gene_bc_matrices/filtered_gene_bc_matrices/hg19/',
var_names='gene_symbols')
pbmc33k
1.2 Exploratory data analysis
pbmc33k.to_df().iloc[0:5,0:5]
pbmc33k.obs.head()
pbmc33k.var.head()
sc.pl.highest_expr_genes(pbmc33k, n_top=20)
# Search for prefix "MT-" (mitochondrial genes) and make new column in variable annotations
# Search for prefix "RPL/RPS" for ribosomal genes and "MRPL/MRPS" for mitochondrial ribosomal genes
pbmc33k.var['mito'] = pbmc33k.var_names.str.startswith('MT-')
pbmc33k.var['ribo'] = pbmc33k.var_names.str.startswith(('RPL','RPS'))
pbmc33k.var['mribo'] = pbmc33k.var_names.str.startswith(('MRPL','MRPS'))
# Calculate QC metrics as per McCarthy et al., 2017 (Scater)
sc.pp.calculate_qc_metrics(pbmc33k, qc_vars=['mito','ribo','mribo'], log1p=False, inplace=True)
pbmc33k.raw = pbmc33k
pbmc33k.obs.head()
sns.jointplot(x='total_counts', y='n_genes_by_counts', height=8, data=pbmc33k.obs,
kind='scatter')
sns.jointplot(x='total_counts', y='pct_counts_mito', height=8, data=pbmc33k.obs,
kind='scatter')
sns.jointplot(x='total_counts', y='pct_counts_ribo', height=8, data=pbmc33k.obs,
kind='scatter')
sns.jointplot(x='total_counts', y='pct_counts_mribo', height=8, data=pbmc33k.obs,
kind='scatter')
plt.show()
pbmc33k.var.head()
sc.pl.scatter(pbmc33k, x='mean_counts', y='n_cells_by_counts')
1.3 Pre-processing
sc.pp.filter_cells(pbmc33k, min_genes=200)
sc.pp.filter_genes(pbmc33k, min_cells=3)
pbmc33k = pbmc33k[pbmc33k.obs.n_genes_by_counts < 2500, :]
pbmc33k = pbmc33k[pbmc33k.obs.pct_counts_mito < 5, :]
pbmc33k = pbmc33k[pbmc33k.obs.pct_counts_mribo < 1, :]
sns.jointplot(x='total_counts', y='n_genes_by_counts', height=8, data=pbmc33k.obs,
kind='scatter')
sns.jointplot(x='total_counts', y='pct_counts_mito', height=8, data=pbmc33k.obs,
kind='scatter')
sns.jointplot(x='total_counts', y='pct_counts_ribo', height=8, data=pbmc33k.obs,
kind='scatter')
sns.jointplot(x='total_counts', y='pct_counts_mribo', height=8, data=pbmc33k.obs,
kind='scatter')
plt.show()
sc.pp.normalize_total(pbmc33k, target_sum=1e4)
sc.pp.log1p(pbmc33k)
sc.pp.highly_variable_genes(pbmc33k, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(pbmc33k)
pbmc33k.raw = pbmc33k
pbmc33k = pbmc33k[:, pbmc33k.var.highly_variable]
sc.pp.regress_out(pbmc33k, ['total_counts','pct_counts_mito'])
sc.pp.scale(pbmc33k, max_value=10)
sc.tl.pca(pbmc33k, svd_solver='arpack')
# pbmc33k.var_names[pbmc33k.var_names.str.startswith('GZMB')]
sc.pl.pca(pbmc33k, color=['CD3D', 'CD19','CD14','GZMB'])
sc.pl.pca_variance_ratio(pbmc33k, log=True)
sc.pp.neighbors(pbmc33k, n_neighbors=10, n_pcs=10)
sc.tl.umap(pbmc33k)
sc.pl.umap(pbmc33k, color=['CD3D', 'CD19','CD14','GZMB'])
sc.pl.umap(pbmc33k, color=['CD3D', 'CD19','CD14','GZMB'], use_raw=False)
sc.tl.leiden(pbmc33k)
sc.pl.umap(pbmc33k, color=['leiden'])
sc.tl.rank_genes_groups(pbmc33k, 'leiden', method='wilcoxon')
sc.pl.rank_genes_groups(pbmc33k, n_genes=25, sharey=False)
pd.DataFrame(pbmc33k.uns['rank_genes_groups']['names']).head(10)
new_cluster_names = [
'B1', 'neutrophil1',
'CD4+ T1', 'CD8+ T1',
'NKT1', 'macrophage1',
'NKT2', 'macrophage2',
'NK1', 'CD4+ T2',
'T1', 'CD8+ T2', 'NK2',
'CD8+ T3', 'T2',
'dendritic cell1', 'B2',
'megakaryocyte', 'neutrophil2',
'dendritic cell2', 'B3',
'NKT3'
]
pbmc33k.rename_categories('leiden', new_cluster_names)
sc.pl.umap(pbmc33k, color='leiden', title='')
pbmc33k.obs
pbmc33k.obs['celltype'] = pbmc33k.obs['leiden'].str.extract(r'^([A-Za-z\+0-9 ]+)(?<!\d{1}$)', expand=False)
sc.pl.umap(pbmc33k, color='celltype', title='')
marker_genes = ['IL7R', 'CD79A', 'MS4A1', 'CD8A', 'CD8B', 'LYZ', 'CD14',
'LGALS3', 'S100A8', 'GNLY', 'NKG7', 'KLRB1',
'FCGR3A', 'MS4A7', 'FCER1A', 'CST3', 'PPBP']
sc.pl.dotplot(pbmc33k, marker_genes, groupby='celltype')
sc.pl.stacked_violin(pbmc33k, marker_genes, groupby='celltype', rotation=90);
sc.pl.heatmap(pbmc33k, marker_genes, groupby='celltype', swap_axes=True)